R Markdown: Exploring Email Variables with Plots

# --- Setup: load the cleaned email feature data frame (emailDFrp) ----
# NOTE(review): a hard-coded setwd() makes this non-portable; prefer
# project-relative paths (e.g. the here package) when sharing the analysis.
getwd()
## [1] "/Users/pankaj/dev/git/smu/qtw/case_study3"
setwd("/Users/pankaj/dev/git/smu/qtw/case_study3/Week_5_Materials_2")
load("data.Rda") 

# The loaded frame has 30 columns (features plus the isSpam response).
length(emailDFrp)
## [1] 30
# Missing-value count per column, plotted against column index.
plot(colSums(is.na(emailDFrp)))

# Work on a copy so the original frame stays untouched for modeling later.
df1= emailDFrp

# Add a sequential row id (1..n) for reference.
df1$id = seq(1, length(emailDFrp$isRe))


# --- Univariate exploration: one stacked bar chart per (mostly binary or
# categorical) feature, with fill = isSpam to show the spam/ham split ----
df1 %>% 
  ggplot(mapping = aes(x= isRe, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= underscore, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= priority, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= isInReplyTo, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= sortedRec, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= subPunc, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= multipartText, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= hasImages, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= isPGPsigned, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= subSpamWords, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= noHost, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= numEnd, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= isYelling, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= isOrigMsg, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= isDear, fill = isSpam))+
  geom_bar()

df1 %>% 
  ggplot(mapping = aes(x= isWrote, fill = isSpam))+
  geom_bar()

# subExcCt / subQuesCt contain NAs (20 rows each per the warnings below),
# which ggplot drops before counting.
df1 %>% 
  ggplot(mapping = aes(x= subExcCt, fill = isSpam))+
  geom_bar()
## Warning: Removed 20 rows containing non-finite values (stat_count).

df1 %>% 
  ggplot(mapping = aes(x= subQuesCt, fill = isSpam))+
  geom_bar()
## Warning: Removed 20 rows containing non-finite values (stat_count).

# numLines is a count (continuous) variable, so bin it explicitly.
# geom_bar() no longer accepts a binwidth argument (see the warning in the
# original render); geom_histogram() is the documented replacement.
df1 %>% 
  ggplot(mapping = aes(x = numLines, fill = isSpam)) +
  geom_histogram(binwidth = 100)

# numDlr (dollar-sign count) is continuous, so use geom_histogram() with an
# explicit binwidth; geom_bar(binwidth = ...) is deprecated (see the warning
# in the original render).
df1 %>% 
  ggplot(mapping = aes(x = numDlr, fill = isSpam)) +
  geom_histogram(binwidth = 100)

# Attachment counts by spam status (stacked bars).
ggplot(df1, aes(x = numAtt, fill = isSpam)) +
  geom_bar()

# numRec (number of recipients), binned. geom_histogram() replaces the
# deprecated geom_bar(binwidth = ...). Rows with NA numRec are still dropped
# by stat_bin (282 rows in the original render).
df1 %>% 
  ggplot(mapping = aes(x = numRec, fill = isSpam)) +
  geom_histogram(binwidth = 100)

# Hour of day the message was sent, by spam status.
df1 %>% 
  ggplot(mapping = aes(x= hour, fill = isSpam))+
  geom_bar()

# Percentage of HTML in the body. This variable is continuous, so the
# stacked bars overlap (hence the position_stack warning below); a
# histogram would be a cleaner choice here.
df1 %>% 
  ggplot(mapping = aes(x= perHTML, fill = isSpam))+
  geom_bar()
## Warning: position_stack requires non-overlapping x intervals

# --- Continuous features: frequency polygons, colored by spam status, so
# the two class distributions can be compared on one set of axes ----
df1 %>% 
  ggplot(mapping = aes(x= forwards ,    color = isSpam))+
  geom_freqpoly( binwidth= 1)

df1 %>% 
  ggplot(mapping = aes(x= perCaps ,    color = isSpam))+
  geom_freqpoly(binwidth= 1 )

# Body character count spans a wide range, hence the coarser binwidth.
df1 %>% 
  ggplot(mapping = aes(x= bodyCharCt ,    color = isSpam))+
  geom_freqpoly( binwidth= 100)

df1 %>% 
  ggplot(mapping = aes(x= avgWordLen ,    color = isSpam))+
  geom_freqpoly(binwidth= 1 )

# subBlanks has 20 NA rows, dropped by stat_bin (warning below).
df1 %>% 
  ggplot(mapping = aes(x= subBlanks ,    color = isSpam))+
  geom_freqpoly( binwidth= 1)
## Warning: Removed 20 rows containing non-finite values (stat_bin).

We can see from the graphs above that some variables dominate one type of message more than others. For example, the variables isInReplyTo and priority take just one value in spam messages, which means they might be useful for splitting in a tree-based model. Other attributes, like subPunc and multipartText, are almost equally present in both types of messages. Similarly, if isPGPsigned is TRUE, then the message is almost always non-spam.

Tree Model

Random Tree Model with 2 variables

## Confusion Matrix and Statistics
## 
##    pred.rpint1
##        F    T
##   F 3489    3
##   T 1122   60
##                                           
##                Accuracy : 0.7593          
##                  95% CI : (0.7468, 0.7715)
##     No Information Rate : 0.9865          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0727          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.75667         
##             Specificity : 0.95238         
##          Pos Pred Value : 0.99914         
##          Neg Pred Value : 0.05076         
##              Prevalence : 0.98652         
##          Detection Rate : 0.74647         
##    Detection Prevalence : 0.74711         
##       Balanced Accuracy : 0.85452         
##                                           
##        'Positive' Class : F               
## 

Grid search for tree model

We can see that the accuracy of the random model above is not very high, so we do a grid search over all variables and different tree heights to figure out which tree will be best suited and where we will get optimal results without overfitting the model.

# --- Grid search over maximum tree depth with caret ----
# Repeated 10-fold cross-validation (3 repeats), scored by ROC AUC;
# classProbs = TRUE is required for twoClassSummary.
train.control <- trainControl(
  method = "repeatedcv",
  number = 10, ## 10-fold CV
  repeats = 3,## repeated three times
  # USE AUC
  summaryFunction = twoClassSummary, 
  classProbs = TRUE
)

# Candidate maximum depths to evaluate.
tune.gridcart <- expand.grid(maxdepth = 2:15)

#tune.gridcart <- expand.grid(cp = seq(2,10))


# Fit a depth-limited CART ("rpart2" tunes maxdepth) across the grid.
# na.action = na.pass leaves NA handling to rpart's surrogate machinery;
# system.time() records the fitting cost.
system.time (rpartFit2 <- train(isSpam ~ . ,                         
                                data = emailDFrp,
                                method = "rpart2", 
                                tuneGrid =tune.gridcart,
                                trControl = train.control,
                                metric = "ROC", na.action = na.pass
))
##    user  system elapsed 
##  30.890   1.213  12.861
rpartFit2 
## CART 
## 
## 9348 samples
##   29 predictor
##    2 classes: 'F', 'T' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 8413, 8414, 8414, 8413, 8414, 8412, ... 
## Resampling results across tuning parameters:
## 
##   maxdepth  ROC        Sens       Spec     
##    2        0.7585786  0.9246642  0.5878144
##    3        0.7761748  0.9501753  0.5849152
##    4        0.7789992  0.9683019  0.5762930
##    5        0.8001320  0.9705070  0.5967329
##    6        0.9161483  0.9513262  0.7352371
##    7        0.9161483  0.9513262  0.7352371
##    8        0.9164148  0.9510385  0.7366318
##    9        0.9177516  0.9530529  0.7377586
##   10        0.9187233  0.9554506  0.7377586
##   11        0.9255864  0.9471549  0.7707078
##   12        0.9289536  0.9450444  0.7860094
##   13        0.9348707  0.9428388  0.8093579
##   14        0.9363542  0.9450928  0.8128382
##   15        0.9371638  0.9479696  0.8149262
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 15.
# ROC vs. maxdepth: the jump happens at depth 6, with small gains after.
plot(rpartFit2)

From the grid search above, it looks like the area under the ROC curve increases with tree depth, but the increase is small after a max depth of 6, so that is the depth we will choose for our final model.

Tree Model with all variables and maxdepth determined by grid search.

# Final classification tree: all predictors, maxdepth = 6 per the grid
# search above, trained on the `train` index subset.
# NOTE(review): the original passed cp, minsplit, minbucket and
# maxsurrogate via `...` AND a separate control = rpart.control(maxdepth=6).
# rpart overwrites `...` control values with the (default-filled) control
# object, so those settings were silently discarded. All control parameters
# now live in a single rpart.control() call.
emailDFrp.rp <- rpart(isSpam ~ . ,                         # formula
                      data = emailDFrp,                    # data frame
                      subset = train,                      # indices of training set
                      method = "class",                    # classification tree
                      parms = list(split = "information"), # use entropy/deviance
                      control = rpart.control(
                        maxdepth = 6,     # depth chosen by the grid search
                        maxsurrogate = 0, # no surrogates (no missing values)
                        cp = 0,           # no size penalty
                        minsplit = 5,     # nodes of size 5 or more can be split,
                        minbucket = 2     # provided each sub-node
                                          # contains at least 2 obs.
                      )
)


#summary(emailDFrp.rp)

# Base-rpart plot of the fitted tree; margin leaves room for the labels.
plot(emailDFrp.rp, 
     uniform=TRUE,
     compress=TRUE,
     margin = .2)
# Overlay node labels; use.n shows observation counts at each node.
text(emailDFrp.rp, 
     use.n=TRUE, 
     all = TRUE,
     fancy = TRUE)

# Prettier rendering of the same tree (rattle package).
fancyRpartPlot(emailDFrp.rp, main="Decision Tree Graph")

Plot and analysis of the paths through tree

From the plot above, we can see that the first split is on the variable forwards. If the value is more than 5.5, another split is done on the variable perCaps. We can see that 53% of all mails are in the node with forwards > 6 and perCaps less than 6; this node contains 73% non-spam and 27% spam. We split this node further on subBlanks less than 24. The other nodes can be explained in the same way until we finally reach the leaf nodes.

Evaluation of Model

Confusion matrix for the model based on all variables and a maximum tree height of 6.

# Predict class labels on the held-out rows (those not in `train`).
pred.rpint <- predict(emailDFrp.rp,
                       newdata = emailDFrp[-train,],
                       type = "class")


#
#  Classification table on the test data
#

# Rows: true labels; columns: predicted labels.
cm = table(emailDFrp$isSpam[-train], pred.rpint)

print(confusionMatrix(cm))
## Confusion Matrix and Statistics
## 
##    pred.rpint
##        F    T
##   F 3387  105
##   T  490  692
##                                           
##                Accuracy : 0.8727          
##                  95% CI : (0.8628, 0.8821)
##     No Information Rate : 0.8295          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6224          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8736          
##             Specificity : 0.8683          
##          Pos Pred Value : 0.9699          
##          Neg Pred Value : 0.5854          
##              Prevalence : 0.8295          
##          Detection Rate : 0.7246          
##    Detection Prevalence : 0.7471          
##       Balanced Accuracy : 0.8709          
##                                           
##        'Positive' Class : F               
## 

As we can see, the accuracy of this model is 87% on the test data.

H2O random forest model

We use the random forest implementation from the h2o library to build another tree-based model.

#knitr::opts_chunk$set(fig.width=12, fig.height=8) 

# Build and train the model:

# Start (or attach to) a local H2O cluster and print its configuration.
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         8 minutes 58 seconds 
##     H2O cluster timezone:       Asia/Singapore 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.30.0.1 
##     H2O cluster version age:    2 months and 13 days  
##     H2O cluster name:           H2O_started_from_R_pankaj_xen810 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.38 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 3.6.0 (2019-04-26)
# Upload the data frame into the H2O cluster as an H2OFrame.
emailDHex = as.h2o(emailDFrp)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# 80/20 train/validation split of the H2O frame (seeded for reproducibility).
splits <- h2o.splitFrame(data = emailDHex, ratios = .8, seed = 1234)
trainHex <- splits[[1]]
validHex <- splits[[2]]
# Predictors: columns 2..29 of the original frame; response is isSpam.
# NOTE(review): the original first assigned predictors = c("isRe") and then
# immediately overwrote it -- the dead assignment has been removed.
predictors <- names(emailDFrp)[seq(2, 29)]
response <- "isSpam"


# Distributed random forest: 5 trees, max depth 5, at least 20 rows per
# leaf. binomial_double_trees builds one tree per class for a binomial
# response; the validation frame tracks held-out performance.
emailD_2tree <- h2o.randomForest(x = predictors,
                                  y = response,
                                  ntrees = 5,
                                  max_depth = 5,
                                  min_rows = 20,
                                  binomial_double_trees = TRUE,
                                  training_frame = trainHex,
                                  validation_frame = validHex)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |=================================================================| 100%
# Pull a single tree (number 2) out of the forest for visualization.
emailDH2oTree2 = h2o.getModelTree(model = emailD_2tree, tree_number = 2)

library(data.tree)


library(DiagrammeR)

# Convert the H2O tree object into a data.tree structure for plotting.
# NOTE(review): createDataTree() is not defined in this file -- presumably
# a helper sourced elsewhere; confirm it is loaded before knitting.
emailDataTree = createDataTree(emailDH2oTree2)

# Edge label callback: the split condition stored on the child node.
GetEdgeLabel <- function(node) node$edgeLabel

# Node shape callback: diamonds for split nodes, ovals for leaves.
# (switch with no default returns NULL for any other node type.)
GetNodeShape <- function(node) {
  switch(node$type, split = "diamond", leaf = "oval")
}

# Font callback: bold for split nodes, regular for leaves.
GetFontName <- function(node) {
  switch(node$type, split = "Palatino-bold", leaf = "Palatino")
}
# Style the edges (split conditions) and nodes of the data.tree, then
# render the tree left-to-right via DiagrammeR.
SetEdgeStyle(emailDataTree, fontname = 'Palatino-italic', 
             label = GetEdgeLabel, labelfloat = TRUE,
             fontsize = "26", fontcolor='royalblue4')
SetNodeStyle(emailDataTree, fontname = GetFontName, shape = GetNodeShape, 
             fontsize = "26", fontcolor='royalblue4',
             height="1.75", width="2")

SetGraphStyle(emailDataTree, rankdir = "LR", dpi=70.)
# Open a fresh graphics device so the graph is not squeezed into the
# previous plot window.
dev.new(width=10, height=10)

plot(emailDataTree, output = "graph")
# Printing the tree also shows the split hierarchy as indented text.
emailDataTree
##                                     levelName
## 1  isYelling                                 
## 2   ¦--hour                                  
## 3   ¦   ¦--prediction: 0                     
## 4   ¦   °--bodyCharCt                        
## 5   ¦       ¦--prediction: 0.14285715        
## 6   ¦       °--prediction: 0.03846154        
## 7   °--isInReplyTo                           
## 8       ¦--perHTML                           
## 9       ¦   ¦--numEnd                        
## 10      ¦   ¦   ¦--forwards                  
## 11      ¦   ¦   ¦   ¦--prediction: 0.20247933
## 12      ¦   ¦   ¦   °--prediction: 0.9230769 
## 13      ¦   ¦   °--multipartText             
## 14      ¦   ¦       ¦--prediction: 0.28787878
## 15      ¦   ¦       °--prediction: 0.8250444 
## 16      ¦   °--subBlanks                     
## 17      ¦       ¦--subBlanks                 
## 18      ¦       ¦   ¦--prediction: 0.03508772
## 19      ¦       ¦   °--prediction: 0.3508772 
## 20      ¦       °--perHTML                   
## 21      ¦           ¦--prediction: 0         
## 22      ¦           °--prediction: 0.12903225
## 23      °--prediction: 1
# Shut down the local H2O cluster now that modeling is complete
# (prompts for confirmation in interactive sessions).
h2o.shutdown()
## Are you sure you want to shutdown the H2O instance running at http://localhost:54321/ (Y/N)?

We found mostly the same variables, in a slightly different order, as in our initial tree model.